/*
Copyright 2008-2009 Elöd Egyed-Zsigmond, Cyril Laitang
Copyright 2009-2011 Samuel Gesche

This file is part of IPRI News Analyzer.

IPRI News Analyzer is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

IPRI News Analyzer is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with IPRI News Analyzer.  If not, see <http://www.gnu.org/licenses/>.
*/

package proc.text;

import org.apache.commons.lang.StringEscapeUtils;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Utility for reducing XML/HTML markup to plain text.
 */
public class XMLCleaner {

    /**
     * Matches any tag-like span {@code <...>} (non-greedy). The {@code (?s)} flag lets a
     * single tag extend over several lines. Compiled once because {@link Pattern} is
     * immutable and safe to share.
     */
    private static final Pattern TAG_PATTERN = Pattern.compile("(?s)<.*?>");

    /**
     * Converts a piece of XML/HTML markup into plain text.
     *
     * <p>Processing steps, in order:
     * <ol>
     *   <li>XML entities are unescaped, so markup that was stored in escaped form
     *       (e.g. {@code &lt;b&gt;}) becomes real tags;</li>
     *   <li>every {@code <...>} span is replaced by a single space;</li>
     *   <li>leading and trailing whitespace is trimmed;</li>
     *   <li>remaining HTML entities are unescaped.</li>
     * </ol>
     *
     * <p>Line breaks inside the text are preserved: multi-line input is no longer
     * collapsed to an empty string.
     *
     * @param xmlTxt the markup to clean; may be {@code null}
     * @return the plain text, an empty string when {@code xmlTxt} is {@code null},
     *         or the literal {@code "Clean error"} if the final HTML unescaping fails
     */
    public static String xmlToText(String xmlTxt) {
        if (xmlTxt == null) {
            return "";
        }

        // The stored text may carry XML-escaped markup; unescape it first so that
        // escaped tags are removed by the tag pattern as well.
        String unescaped = StringEscapeUtils.unescapeXml(xmlTxt);

        // Replace each tag with a space (so adjacent words do not get glued together),
        // then drop the padding this leaves at both ends.
        String text = TAG_PATTERN.matcher(unescaped).replaceAll(" ").trim();

        try {
            return StringEscapeUtils.unescapeHtml(text);
        } catch (RuntimeException e) {
            // Best-effort contract kept from the original: report and return a marker
            // value instead of propagating the failure to the caller.
            System.out.println("erreur xmlClean ");
            return "Clean error";
        }
    }

}
